This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
##all viruses - target and off target
library(tidyverse)
── Attaching core tidyverse packages ──────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.2 ── Conflicts ────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(ggpubr)
library(tidyplots)
Attaching package: ‘tidyplots’
The following object is masked from ‘package:ggpubr’:
gene_expression
library(ggthemes)
library(scales)
Attaching package: ‘scales’
The following object is masked from ‘package:purrr’:
discard
The following object is masked from ‘package:readr’:
col_factor
#Linear model for TE data - separated by virus
#November 2024
#viral read depth
#use bowtie2 data
library(tidyverse)
library(broom)
library(boot)
####read counts/normalised read counts####
#metadata
metadata <-
read.csv(
"metadata/sampleIDs_TESpikeIn.csv",
header = TRUE
)
metadata2 <- metadata |>
select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
rename(Sample_id = Sample.ID) |>
rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)
#import and combine read count files
#deduplicated and non-deduplicated
counts_dedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
counts_nodedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)
counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")
####read depths####
#import and combine read depth files
depth_dedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
depth_nodedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
depths_bt_all <- rbind(depth_dedup_bt, depth_nodedup_bt)
depths_reads <- left_join(depths_bt_all, metadata2, by = "Sample_id") |>
mutate(genome_structure = case_when((
virus == "Human_adenovirus_40" |
virus == "Human_betaherpesvirus"
) ~ "DNA",
,
.default = "RNA"
))
####linear model for spike in viruses - mean read depth####
depths_reads_sub <- depths_reads |>
filter(Background != "p6") |>
filter(Background != "control") |>
filter(type == "dedup_TE") |>
mutate(log_depth = log10(mean_depth)) |>
mutate(log_viral_load = log10(Viral.load))
#inspect data
hist(depths_reads_sub$mean_depth)
hist(depths_reads_sub$Viral.load)
hist(depths_reads_sub$log_depth)
hist(depths_reads_sub$log_viral_load)
summary(depths_reads_sub$log_depth)
summary(depths_reads_sub$log_viral_load)
#all viruses together
lm1 <-
glm(log_depth ~ log_viral_load * virus,
data = depths_reads_sub,
family = "gaussian")
summary(lm1)
glm.diag.plots(lm1)
pred <- predict(lm1, type = "response")
rsq <- function (x, y) {
cor(x, y) ^ 2
}
rsq(depths_reads_sub$log_depth, pred)
##split by viruses
cols <- c("#4477AA",
"#66CCEE",
"#228833",
"#CCBB44",
"#EE6677",
"#AA3377")
facet_names <- c(
"Human_adenovirus_40" = "Human adenovirus 40",
"Human_betaherpesvirus" = "Human betaherpesvirus",
"Human_respiratory_syncytial_virus" = "Human respiratory syncytial virus",
"Influenza_B_virus" = "Influenza B virus",
"Mammalian_orthoreovirus3" = "Mammalian orthoreovirus 3",
"Zika_virus" = "Zika virus"
)
virus <- (unique(depths_reads_sub$virus)) %>%
rep(., each = 2) %>%
data.frame()
list_models <-
depths_reads_sub |>
group_split(virus) |>
map( ~ lm(log_depth ~ log_viral_load, data = .))
lm_tidy <-
map(list_models, broom::tidy) %>%
do.call(rbind.data.frame, .) %>%
cbind(virus, .) %>%
rename(virus = ".") %>%
select(virus, term, estimate) %>%
pivot_wider(names_from = "term", values_from = "estimate")
lm_summary <- depths_reads_sub |>
group_by(virus) |>
summarise(
Intercept = lm(log_depth ~ log_viral_load)$coefficients[1],
Coeff_x1 = lm(log_depth ~ log_viral_load)$coefficients[2],
R2 = summary(lm(log_depth ~ log_viral_load))$r.squared,
pvalue = summary(lm(log_depth ~ log_viral_load))$coefficients["log_viral_load", 4]
)
lm_combined <-
left_join(lm_tidy, lm_summary, by = "virus")
#ggsave("figures/compare_spike_ins_atcc/model_readdepth_dedup.png")
ggsave("figures/manuscript_figure_2025/PDF/Figure_1.pdf",
width = 8,
height = 5)
ggsave("figures/manuscript_figure_2025/PNG/Figure_1.png",
width = 8,
height = 5)
#ATCC genomes updated version
#October 2024
#viral load vs read count normalised using different methods, comparing deduplicated and non-deduplicated
#use bowtie2 data
####read counts/normalised read counts####
#metadata
metadata <- read.csv("metadata/sampleIDs_TESpikeIn.csv", header = TRUE)
metadata2 <- metadata |>
select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
rename(Sample_id = Sample.ID) |>
rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)
#import and combine read count files
#deduplicated and non-deduplicated
counts_dedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
counts_nodedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)
counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")
#normalise by both raw read count and genome length - should be the same as mean read depth
counts_reads_norm <- counts_reads |>
mutate(norm_counts1 = matched / QC_reads) |>
mutate(norm_counts2 = matched / QC_reads / length) |>
mutate(norm_counts3 = matched / length) |>
mutate(genome_structure = case_when((
virus == "Human_adenovirus_40" |
virus == "Human_betaherpesvirus"
) ~ "DNA",
.default = "RNA"
))
#change labels in facet plots
facet_names <- c("dedup_TE" = "Deduplicated",
"nodedup_TE" = "Non-Deduplicated")
cols <- c("#4477AA",
"#66CCEE",
"#228833",
"#CCBB44",
"#EE6677",
"#AA3377")
#plot viral read counts (non normalised)
read_count <-
counts_reads_norm |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = Viral.load,
y = log10(matched),
colour = virus
)) +
geom_point() +
geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
facet_grid( ~ type, labeller = as_labeller(facet_names)) +
theme_few() +
theme(
axis.title.x = element_text(size = 12),
axis.text.x = element_text(angle = 0, hjust = 1),
axis.title.y = element_text(size = 12),
legend.title = element_blank(),
legend.position = "top",
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_color_manual(
values = cols,
labels = c(
"Human adenovirus 40",
"Human betaherpesvirus",
"Human respiratory syncytial virus",
"Influenza B virus",
"Mammalian orthoreovirus 3",
"Zika virus"
)
) +
ylab("log10(Viral Reads)") +
xlab("Log (Viral load)") +
scale_x_log10(labels = scales::trans_format("log10", scales::label_math()))
#ggsave("figures/compare_spike_ins_atcc/read_counts.pdf",width=8,height=6)
#normalise by raw read count
read_count_norm1 <- counts_reads_norm |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = Viral.load,
y = log10(norm_counts1),
colour = virus
)) +
geom_point() +
geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
facet_grid( ~ type, labeller = as_labeller(facet_names)) +
theme_few() +
theme(
axis.title.x = element_text(size = 12),
axis.text.x = element_text(angle = 0, hjust = 1),
axis.title.y = element_text(size = 12),
legend.title = element_blank(),
legend.position = "top",
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_color_manual(
values = cols,
labels = c(
"Human adenovirus 40",
"Human betaherpesvirus",
"Human respiratory syncytial virus",
"Influenza B virus",
"Mammalian orthoreovirus 3",
"Zika virus"
)
) +
ylab("log10(viral reads/cleaned reads)") +
xlab("Log (Viral load)") +
scale_x_log10(labels = scales::trans_format("log10", scales::label_math()))
#normalise by raw read count and genome length
read_count_norm2 <-
counts_reads_norm |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = (Viral.load),
y = log10(norm_counts2),
colour = virus
)) +
geom_point() +
geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
facet_grid( ~ type, labeller = as_labeller(facet_names)) +
theme_few() +
theme(
axis.title.x = element_text(size = 12),
axis.text.x = element_text(angle = 0, hjust = 1),
axis.title.y = element_text(size = 12),
legend.title = element_blank(),
legend.position = "top",
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_color_manual(
values = cols,
labels = c(
"Human adenovirus 40",
"Human betaherpesvirus",
"Human respiratory syncytial virus",
"Influenza B virus",
"Mammalian orthoreovirus 3",
"Zika virus"
)
) +
ylab("Log (viral reads/cleaned reads/genome length)") +
xlab("Log (Viral load)") +
scale_x_log10(labels = scales::trans_format("log10", scales::label_math()))
####read depths####
#import and combine read depth files
depth_dedup_bt <-
read_tsv(
"data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv"
)
depth_nodedup_bt <-
read_tsv(
"data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
)
depths_bt_all <- rbind(depth_dedup_bt, depth_nodedup_bt)
depths_reads <-
depths_bt_all |>
left_join(metadata2, by = "Sample_id") |>
mutate(genome_structure = case_when((
virus == "Human_adenovirus_40" |
virus == "Human_betaherpesvirus"
) ~ "DNA",
.default = "RNA"
)) |>
rename(Viral.load = `Viral load`)
read_depths <-
depths_reads |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = Viral.load,
y = log10(mean_depth),
colour = virus
)) +
geom_point() +
geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
facet_grid( ~ type, labeller = as_labeller(facet_names)) +
theme_few() +
theme(
axis.title.x = element_text(size = 12),
axis.text.x = element_text(angle = 0, hjust = 1),
axis.title.y = element_text(size = 12),
legend.title = element_blank(),
legend.position = "top",
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_color_manual(
values = cols,
labels = c(
"Human adenovirus 40",
"Human betaherpesvirus",
"Human respiratory syncytial virus",
"Influenza B virus",
"Mammalian orthoreovirus 3",
"Zika virus"
)
) +
ylab("Log (mean read depth)") +
xlab("Log (Viral load)") +
scale_x_log10(labels = scales::trans_format("log10", scales::label_math()))
#ggsave("figures/compare_spike_ins_atcc/mean_depth.pdf",width=8,height=6)
#metadata
metadata <-
read.csv(
"metadata/sampleIDs_TESpikeIn.csv",
header = TRUE
)
metadata2 <- metadata |>
select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
rename(Sample_id = Sample.ID) |>
rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)
#import and combine read count files
#deduplicated and non-deduplicated
counts_dedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
counts_nodedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)
counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")
#normalise by both raw read count and genome length - should be the same as mean read depth
counts_reads_norm <- counts_reads |>
mutate(norm_counts1 = matched / QC_reads) |>
mutate(norm_counts2 = matched / QC_reads / length) |>
mutate(norm_counts3 = matched / length) |>
mutate(genome_structure = case_when((
virus == "Human_adenovirus_40" |
virus == "Human_betaherpesvirus"
) ~ "DNA",
.default = "RNA"
))
####compare library capture pool####
cols3 <- c("#228833", "#AA3377")
facet_names <- c("dedup_TE" = "Deduplicated",
"nodedup_TE" = "Non-Deduplicated")
metadata3 <-
metadata |>
select(Sample.ID, Pool.for.sequencing) |>
rename(Sample_id = Sample.ID) |>
rename(Pool = Pool.for.sequencing)
counts_pool <- left_join(counts_reads_norm, metadata3, by = "Sample_id")
pool_counts_sum <-
counts_pool |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = as.character(Viral.load),
y = log10(matched),
colour = Pool
)) +
geom_boxplot() +
facet_grid( ~ type, labeller = as_labeller(facet_names)) +
theme_few() +
theme(
axis.title.x = element_blank(),
# axis.text.x = element_blank(),
legend.title = element_blank(),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
ylab("Log (Viral Reads)") + xlab("Log (Viral load)") +
scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2")) +
scale_x_discrete(labels = c(bquote(10^{2}), bquote(10^{3}), bquote(10^{5})))
# pool_readcounts_norm <-
# counts_pool |>
# filter(Background != "p6") |>
# filter(Background != "control") |>
# ggplot(aes(x = virus, y = log10(norm_counts1))) +
# geom_boxplot() +
# facet_grid(Pool ~ type) +
# theme_few() +
# theme(
# axis.title.x = element_blank(),
# # axis.text.x = element_blank(),
# axis.title.y = element_text(size = 10),
# panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
# panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
# ) +
# ylab("log10(viral reads/cleaned reads)")
pool_norm1_sum <-
counts_pool |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = as.character(Viral.load),
y = log10(matched),
colour = Pool
)) +
geom_boxplot() +
facet_grid( ~ type, labeller = as_labeller(facet_names)) +
theme_few() +
theme(
axis.title.x = element_blank(),
# axis.text.x = element_blank(),
legend.title = element_blank(),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
ylab("log10(viral reads/cleaned reads)") + xlab("Log (Viral load)") +
scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2")) +
scale_x_discrete(labels = c(bquote(10^{2}), bquote(10^{3}), bquote(10^{5})))
# pool_readcounts_norm2 <-
# counts_pool |>
# filter(Background != "p6") |>
# filter(Background != "control") |>
# ggplot(aes(x = virus, y = log10(norm_counts2))) +
# geom_boxplot() +
# facet_grid(Pool ~ type) +
# theme_few() +
# theme(
# axis.title.x = element_blank(),
# # axis.text.x = element_blank(),
# axis.title.y = element_text(size = 10),
# panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
# panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
# ) +
# ylab("log10(viral reads/cleaned reads/genome length)")
pool_norm2_sum <-
counts_pool |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = as.character(Viral.load),
y = log10(matched),
colour = Pool
)) +
geom_boxplot() +
facet_grid( ~ type, labeller = as_labeller(facet_names)) +
theme_few() +
theme(
axis.title.x = element_blank(),
# axis.text.x = element_blank(),
legend.title = element_blank(),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
ylab("log10(viral reads/cleaned reads/genome length)") + xlab("Log (Viral load)") +
scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2")) +
scale_x_discrete(labels = c(bquote(10^{2}), bquote(10^{3}), bquote(10^{5})))
#import and combine read depth files
depth_dedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
depth_nodedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
depths_bt_all <- rbind(depth_dedup_bt, depth_nodedup_bt)
depths_reads <-
left_join(depths_bt_all, metadata2, by = "Sample_id") |>
mutate(genome_structure = case_when((
virus == "Human_adenovirus_40" |
virus == "Human_betaherpesvirus"
) ~ "DNA",
.default = "RNA"
))
depths_pool <- left_join(depths_reads, metadata3, by = "Sample_id")
# pool_readdepths <-
# depths_pool |>
# filter(Background != "p6") |>
# filter(Background != "control") |>
# ggplot(aes(x = virus, y = log10(mean_depth))) +
# geom_boxplot() +
# facet_grid(Pool ~ type) +
# theme_few() +
# theme(
# axis.title.x = element_blank(),
# axis.text.x = element_text(angle = 45, hjust = 1),
# axis.title.y = element_text(size = 10),
# panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
# panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
# ) +
# ylab("Log (mean read depth)")
pool_depths_sum <-
depths_pool |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = as.character(Viral.load),
y = log10(mean_depth),
colour = Pool
)) +
geom_boxplot() +
facet_grid( ~ type, labeller = as_labeller(facet_names)) +
theme_few() +
theme(
axis.title.x = element_blank(),
# axis.text.x = element_blank(),
legend.title = element_blank(),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
ylab("Log (mean read depth)") +
xlab("Log (Viral load)") +
scale_x_discrete(labels = c(bquote(10^{2}), bquote(10^{3}), bquote(10^{5}))) +
scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2"))
ggarrange(
pool_counts_sum,
pool_norm1_sum,
pool_norm2_sum,
pool_depths_sum,
nrow = 2,
ncol = 2,
common.legend = TRUE,
align = "hv"
)
Error: object 'pool_counts_sum' not found
##plots of read counts and viral read counts
#November 2024
#metadata
metadata <-
read.csv(
"metadata/sampleIDs_TESpikeIn.csv",
header = TRUE
)
metadata2 <- metadata |>
select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
rename(Sample_id = Sample.ID) |>
rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)
#import read count files
counts_dedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
counts_nodedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
#import viral reads mapped (to calculate proportions)
viral_reads_dedup <-
read.csv(
"data_TE/total_virus_mapped_reads_per_sample_dedup_atcc_ref_20241108.csv",
header = TRUE
)
viral_reads_nodedup <-
read.csv(
"data_TE/total_virus_mapped_reads_per_sample_nodedup_atcc_ref_20241108.csv",
header = TRUE
)
cols2 <- c("#BB5566", "#004488")
facet_names <- c("dedup_TE" = "Deduplicated",
"nodedup_TE" = "Non-Deduplicated")
reads_metadata_dedup <-
left_join(counts_dedup_bt, metadata2, by = "Sample_id")
reads_viral_dedup <-
left_join(reads_metadata_dedup, viral_reads_dedup, by = "Sample_id")
reads_metadata_nodedup <-
left_join(counts_nodedup_bt, metadata2, by = "Sample_id")
reads_viral_nodedup <-
left_join(reads_metadata_nodedup, viral_reads_nodedup, by = "Sample_id")
reads_viral_all <- rbind(reads_viral_dedup, reads_viral_nodedup)
reads_plot <-
reads_viral_all |>
group_by(Background, Sample_id, Viral.load, type) |>
summarise(
total_reads = (QC_reads * 2),
viral_reads = total_virus_reads,
ATCC_reads = sum(matched),
prop_ATCC = ATCC_reads / total_reads,
prop_viral = total_virus_reads / total_reads,
diff = viral_reads - ATCC_reads
) |>
unique()
Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0.
Please use `reframe()` instead.
When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust accordingly.`summarise()` has grouped output by 'Background', 'Sample_id', 'Viral.load', 'type'. You can override using the `.groups` argument.
####read counts/depths split by background####
#change labels in facet plots
facet_names_bg <- c(
"dedup_TE" = "Deduplicated",
"nodedup_TE" = "Non-Deduplicated",
"p2" = "P1",
"p8" = "P2"
)
#break down by Background x virus
background_counts_reads <-
counts_reads_norm |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = Viral.load,
y = log10(matched),
colour = virus
)) +
geom_point() +
geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
theme_few() +
theme(
axis.title.x = element_blank(),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_color_manual(
values = cols,
labels =
c(
"Human adenovirus 40",
"Human betaherpesvirus",
"Human respiratory syncytial virus",
"Influenza B virus",
"Mammalian orthoreovirus 3",
"Zika virus"
)
) +
scale_x_log10(labels = scales::label_log10()) +
ylab("log10(Viral Reads)")
backgrounds_counts_reads_norm <-
counts_reads_norm |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = Viral.load,
y = log10(norm_counts1),
colour = virus
)) +
geom_point() +
geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
theme_bw() +
theme_few() +
theme(
axis.title.x = element_blank(),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_color_manual(
values = cols,
labels = c(
"Human adenovirus 40",
"Human betaherpesvirus",
"Human respiratory syncytial virus",
"Influenza B virus",
"Mammalian orthoreovirus 3",
"Zika virus"
)
) +
scale_x_log10(label = scales::label_log10()) +
ylab("log10(viral reads/cleaned reads)")
backgrounds_counts_reads_norm2 <-
counts_reads_norm |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = Viral.load,
y = log10(norm_counts2),
colour = virus
)) +
geom_point() +
geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
theme_few() +
theme(
axis.title.x = element_blank(),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_color_manual(
values = cols,
labels = c(
"Human adenovirus 40",
"Human betaherpesvirus",
"Human respiratory syncytial virus",
"Influenza B virus",
"Mammalian orthoreovirus 3",
"Zika virus"
)
) +
scale_x_log10(labels = scales::label_log10()) +
ylab("log10(viral reads/cleaned reads/genome length)")
background_read_depths <-
depths_reads |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = Viral.load,
y = log10(mean_depth),
colour = virus
)) +
geom_point() +
geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
theme_few() +
theme(
axis.title.x = element_blank(),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray"),
axis.title.y = element_text(size = 10),
legend.title = element_blank()
) +
scale_color_manual(
values = cols,
labels = c(
"Human adenovirus 40",
"Human betaherpesvirus",
"Human respiratory syncytial virus",
"Influenza B virus",
"Mammalian orthoreovirus 3",
"Zika virus"
)
) +
scale_x_log10(label = scales::label_log10()) +
ylab("log10(mean read depth)")
#plots of TE experiment data - genome coverage & per site coverage
#also separated by viruses
#ATCC genomes updated version
#October 2024
#use bowtie2 data
####genome coverage####
unzip(zipfile = "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv.zip", exdir = "data_TE/")
unzip(zipfile = "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv.zip", exdir = "data_TE/")
dedup_per_site <-
read_tsv(
"data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv"
)
nodedup_per_site <-
read_tsv(
"data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
)
file.remove("data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv")
file.remove( "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv")
#add length column from read depth file to per site file
#metadata
metadata <-
read.csv(
"metadata/sampleIDs_TESpikeIn.csv",
header = TRUE
)
metadata2 <-
metadata |>
select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
rename(Sample_id = Sample.ID) |>
rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)
#import and combine read count files
#deduplicated and non-deduplicated
counts_dedup_bt <-
read_tsv(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv"
)
counts_nodedup_bt <-
read_tsv(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
)
counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)
counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")
#normalise by both raw read count and genome length - should be the same as mean read depth
counts_reads_norm <-
counts_reads |>
mutate(norm_counts1 = matched / QC_reads) |>
mutate(norm_counts2 = matched / QC_reads / length) |>
mutate(norm_counts3 = matched / length) |>
mutate(genome_structure = case_when((
virus == "Human_adenovirus_40" |
virus == "Human_betaherpesvirus"
) ~ "DNA",
.default = "RNA"
))
lengths <-
counts_reads_norm |>
select(virus, length) |>
distinct()
#calculate genome coverage
persite_coverage_dedup <-
dedup_per_site |>
rename(Viral.load = `Viral load`) |>
group_by(Sample_id, virus, Viral.load, Background, type) |>
filter(coverage > 0) |>
summarise(genome_coverage = n()) |>
left_join(lengths) |>
mutate(percent_coverage = genome_coverage / length)
persite_coverage_nodedup <-
nodedup_per_site |>
rename(Viral.load = `Viral load`) |>
group_by(Sample_id, virus, Viral.load, Background, type) |>
filter(coverage > 0) |>
summarise(genome_coverage = n()) |>
left_join(lengths) |>
mutate(percent_coverage = genome_coverage / length)
persite_coverage_both <-
rbind(persite_coverage_dedup,
persite_coverage_nodedup)
coverage_labels <- c("0" = "Control",
"100" = "10^{2}",
"1000" = "10^{3}",
"10000" = "10^{5}",
"dedup_TE" = "Deduplicated",
"nodedup_TE"= "Non-Deduplicated")
coverage_labels2 <- c("0" = "Control",
"100" = "10^{2}",
"1000" = "10^{3}",
"10000" = "10^{5}")
ggsave("figures/manuscript_figure_2025/PDF/Figure_S4.pdf",
width=15,
height=10)
ggsave("figures/manuscript_figure_2025/PNG/Figure_S4.png",
width=15,
height=10)
####Final plot
file.remove("figures/manuscript_figure_2025/PDF/FigureS6.pdf")
[1] TRUE
####Final plot
file.remove("figures/manuscript_figure_2025/PNG/Figure_S6.pdf")
[1] TRUE
HBV <-
persite_norm |>
filter(Background != "p6") |>
filter(Background != "control") |>
filter(type == "dedup_TE") |>
filter(virus == "Human_betaherpesvirus") |>
rename(Viral.load = `Viral load`) |>
ggplot(aes(x = site, y = coverage, fill = Background)) +
geom_col() +
facet_wrap(Viral.load ~ Sample_id, label = label_parsed) +
ylab("Coverage") +
ggtitle("Human Betaherpesvirus (deduplicated)") +
guides(fill = guide_legend(title = "Background")) +
theme_few() +
theme(
axis.title.x = element_text(size = 12),
axis.text.x = element_text(angle = 30, hjust = 1),
axis.title.y = element_text(size = 12),
legend.title = element_blank(),
legend.position = "top",
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_y_log10() +
scale_fill_manual(values = cols2, labels = c("P1", "P2"))
####Final plot
file.remove("figures/manuscript_figure_2025/PNG/Figure_S7.pdf")
[1] TRUE
#have to do these differently due to segmentation
FLU_ME1 <- persite_norm |>
filter(virus == "Influenza_B_virus") |>
filter(type == "dedup_TE") |>
filter(Background == "p2") |>
rename(Viral.load = `Viral load`) |>
ggplot(aes(
x = site,
y = coverage,
fill = as.character(Viral.load)
)) +
geom_col() +
facet_grid(seg ~ Sample_id) +
ylab("Coverage") +
ggtitle("Influenza B virus (Background 1 deduplicated)") +
guides(fill = guide_legend(title = "Viral load")) +
theme_few() +
theme(
axis.title.x = element_text(size = 12),
axis.text.x = element_text(angle = 30, hjust = 1),
axis.title.y = element_text(size = 12),
legend.title = element_blank(),
legend.position = "top",
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_y_log10() +
scale_fill_manual(values = cols4, label = c(expression("10"^"2"), expression("10"^"3"), expression("10"^"5")))
FLU_ME2 <- persite_norm |>
filter(virus == "Influenza_B_virus") |>
filter(type == "dedup_TE") |>
filter(Background == "p2") |>
rename(Viral.load = `Viral load`) |>
ggplot(aes(
x = site,
y = coverage,
fill = as.character(Viral.load)
)) +
geom_col() +
facet_grid(seg ~ Sample_id) +
ylab("Coverage") +
ggtitle("Influenza B virus (Background 2 deduplicated)") +
guides(fill = guide_legend(title = "Viral load")) +
theme_few() +
theme(
axis.title.x = element_text(size = 12),
axis.text.x = element_text(angle = 30, hjust = 1),
axis.title.y = element_text(size = 12),
legend.title = element_blank(),
legend.position = "top",
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_y_log10() +
scale_fill_manual(values = cols4, label = c(expression("10"^"2"), expression("10"^"3"), expression("10"^"5")))
ggarrange(FLU_ME1, FLU_ME2, ncol = 2, common.legend = TRUE)
####Final plot
ggsave("figures/manuscript_figure_2025/PDF/Figure_S8.pdf",
width=20,
height=12,
dpi=600)
ggsave("figures/manuscript_figure_2025/PNG/Figure_S8.png",
width=20,
height=12)
REO_ME1 <-
persite_norm |>
filter(virus == "Mammalian_orthoreovirus3") |>
filter(type == "dedup_TE") |>
filter(Background == "p2") |>
rename(Viral.load = `Viral load`) |>
ggplot(aes(
x = site,
y = coverage,
fill = as.character(Viral.load)
)) +
geom_col() +
facet_grid(seg ~ Sample_id) +
ylab("log10(Coverage)") +
ggtitle("Mammalian orthoreovirus 3 (Background 1 deduplicated)") +
guides(fill = guide_legend(title = "Viral load")) +
theme_few() +
theme(
axis.title.x = element_text(size = 12),
axis.text.x = element_text(angle = 30, hjust = 1),
axis.title.y = element_text(size = 12),
legend.title = element_blank(),
legend.position = "top",
panel.grid.major = element_line(linewidth = 0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth = 0.1, color = "lightgray")
) +
scale_y_log10() +
scale_fill_manual(values = cols4,
label = c(
expression("10" ^ "2"),
expression("10" ^ "3"),
expression("10" ^ "5")
))
REO_ME2 <-
persite_norm |>
filter(virus == "Mammalian_orthoreovirus3") |>
filter(type == "dedup_TE") |>
filter(Background == "p8") |>
rename(Viral.load = `Viral load`) |>
ggplot(aes(
x = site,
y = coverage,
fill = as.character(Viral.load)
)) +
geom_col() +
facet_grid(seg ~ Sample_id) +
ylab("log10(Coverage)") +
ggtitle("Mammalian orthoreovirus 3 (Background 2 deduplicated)") +
guides(fill = guide_legend(title = "Viral load")) +
theme_few() +
theme(
axis.title.x = element_text(size = 12),
axis.text.x = element_text(angle = 30, hjust = 1),
axis.title.y = element_text(size = 12),
legend.title = element_blank(),
legend.position = "top",
panel.grid.major = element_line(linewidth = 0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth = 0.1, color = "lightgray")
) + scale_y_log10() +
scale_fill_manual(values = cols4,
label = c(
expression("10" ^ "2"),
expression("10" ^ "3"),
expression("10" ^ "5")
))
ggarrange(REO_ME1, REO_ME2, ncol = 2, common.legend = TRUE)
####Final plot
ggsave("figures/manuscript_figure_2025/PDF/Figure_S9.pdf",
width=20,
height=12,
dpi=600)
ggsave("figures/manuscript_figure_2025/PNG/Figure_S9.png",
width=20,
height=12,
dpi=600)
metadata2 <-
metadata |>
select(
Sample.ID,
Background.sample,
Viral.load,
Number.of.read.pairs..quality.adaptor.trimmed.
) |>
rename(Sample_id = Sample.ID) |>
rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)
Error in rename(rename(select(metadata, Sample.ID, Background.sample, :
could not find function "rename"